In this project we will examine data from Craigslist postings in both Davis, CA and Berkeley, CA. Since both cities are home to a well-known college and are also local, I wanted to explore the difference in the price of living accommodations and see just how different the average living situation would be for people living in each city.
Since Craigslist has changed their posts to dynamically update I will be downloading all of the HTML files for 360 posts for both Davis and Berkeley. Eight of the posts for Davis were invalid (didn’t contain enough information on them to be extracted) and we will use these 352 posts to compare with the 360 from Berkeley.
These functions are for extracting the information from a full craigslist apartment post.
# Replace an empty result with NA so callers always receive at least one value.
#   x - any R object.
# Returns `x` unchanged when it has one or more elements, otherwise a single NA.
orNA <- function(x) {
  if (length(x) > 0) x else NA
}
# Extract one row of listing data from a single Craigslist apartment post.
#
# Arguments:
#   url  - path or URL of the post; only read when neither `html` nor `doc`
#          is supplied.
#   html - lines of the page; only used to build the default `doc`.
#   doc  - parsed HTML document that every XPath query below runs against.
#
# Returns a data.frame holding body, title, rent, Sqft and datePosted, the
# fields of the embedded 'ld_posting_data' JSON (minus its address), one column
# per attribute named in `attrs`, and a list-column `address`.
# The title, body, price, housing-spec and timestamp lookups fall back to NA
# when the node is absent instead of failing.
procPost =
function(url, html = readLines(url), doc = htmlParse(html))
{
    # Can get title from breadcrumb json, <meta property="og:title"> or <title> in <head>
    title = xpathSApply(doc, "/html/head/title", xmlValue)
    # gsub() yields character(0) when no <title> was found; orNA keeps the row.
    title = orNA(gsub("- apts/housing for rent.*", "", title))

    json = xpathSApply(doc, "/html/head/script[@id = 'ld_posting_data']", xmlValue)
    meta = fromJSON(json)

    # The address is not a single value and may vary from post to post, and some
    # posts have none. Pull it out of meta into its own variable when present,
    # otherwise set it to NA, so the remaining fields convert to flat columns.
    i = match("address", names(meta))
    if (!is.na(i)) {
        address = meta[[i]]
        meta = meta[-i]
    } else {
        address = NA
    }

    attrs = c("parking", "laundry", "pets_dog", "pets_cat", "rent_period",
              "airconditioning", "application_fee_explained", "no_smoking", "wheelaccess")
    info = lapply(attrs, getAttrValue, doc)
    names(info) = attrs

    # Check for emptiness BEFORE indexing: `x[[1]]` on an empty node set raises
    # "subscript out of bounds", so wrapping the indexed value in orNA() cannot help.
    bodyNodes = getNodeSet(doc, "//section[@id = 'postingbody']")
    body = if (length(bodyNodes) > 0) xmlValue(bodyNodes[[1]]) else NA

    dateNodes = getNodeSet(doc, "//div[@class = 'postinginfos']//time[contains(@class, 'timeago')]/@datetime")
    datePosted = if (length(dateNodes) > 0) dateNodes[[1]] else NA

    # Same reasoning for the price: `[1]` on an empty result is not empty, so
    # test the length of the full result first.
    rentValues = xpathSApply(doc, "//span[@class='price']", xmlValue)
    rent = if (length(rentValues) > 0) rentValues[[1]] else NA

    # Try to extract sqft from the specs of the post. Only the first match is
    # kept so that one post never expands into several rows.
    specs_raw = as.character(unlist(xpathSApply(doc, "//span[@class='housing']", xmlValue)))
    sqft_match = regmatches(specs_raw, regexpr("[0-9]+\\s*ft2", specs_raw))
    sqft = orNA(as.numeric(gsub("\\s*ft2", "", sqft_match)))[1]

    ans = cbind(data.frame(body = body,
                           title = title,
                           rent = rent,
                           Sqft = sqft,
                           datePosted = datePosted
                           ),
                as.data.frame(meta),
                as.data.frame(info))
    #if("smokingAllowed" %in% names(ans)) browser()

    # The address may be a nested structure, so it is stored as a list-column.
    ans$address = list(address)
    ans
}
# Look up the displayed value of one listing attribute inside the post's
# 'attrgroup' section.
#   what - class-name fragment identifying the attribute (e.g. "parking").
#   doc  - parsed HTML document to query.
# Returns the text of the matching 'valu' span(s); when none match, the text of
# a link inside a 'valu' span whose href contains `what`; NA when neither exists.
getAttrValue =
function(what, doc)
{
    # First choice: a div whose class mentions the attribute directly.
    directPath = sprintf("//div[@class = 'attrgroup']//div[contains(@class, '%s')]/span[@class = 'valu']", what)
    found = xpathSApply(doc, directPath, xmlValue, trim = TRUE)
    if (length(found) > 0)
        return(found)

    # Fallback: a plain 'attr' div whose value links to a URL naming the attribute.
    linkPath = sprintf("//div[@class = 'attrgroup']//div[@class = 'attr' and not(div[contains(@class, '%s')])]/span[@class = 'valu']/a[contains(@href, '%s')]", what, what)
    orNA(xpathSApply(doc, linkPath, xmlValue, trim = TRUE))
}
# Parse every saved post (*.html) in `folder` and stack the per-post rows.
#   folder - directory containing the downloaded post pages.
# Returns the result of rbind.fill over the per-file procPost() outputs.
loadCraigslistFolder <- function(folder) {
  # Parse one file as UTF-8 and hand the parsed document to procPost().
  parseOne <- function(path) {
    parsed <- htmlParse(path, encoding = "UTF-8")
    procPost(path, html = readLines(path), doc = parsed)
  }

  postFiles <- list.files(folder, full.names = TRUE, pattern = "\\.html$")
  do.call(rbind.fill, lapply(postFiles, parseOne))
}
Now we will run these functions on both our Davis data (labeled “Sacramento Data” since that is how Craigslist is identifying the area surrounding Davis) and also our Berkeley data (stored in variables prefixed “sf” in the code).
# Raw (uncleaned) listing table built from the saved posts in the "craigslist_sacramento" folder.
dirtySacramentoData <- loadCraigslistFolder("craigslist_sacramento")
# Raw listing table for the second city; the variable says "Sf" but the folder read is "craigslist_berkeley".
dirtySfData <- loadCraigslistFolder("craigslist_berkeley")
However we will now need to clean this data since the format and data types don’t come ready for analysis in R.
# Prepare a raw listing table for analysis.
#   df - data.frame of scraped listings.
# Returns `df` with:
#   * the `X.context` column removed and `X.type` renamed to `Type` (both are
#     produced by as.data.frame() on the nested JSON and carry nothing useful
#     for this analysis);
#   * bedroom/bathroom counts, Sqft, latitude and longitude coerced to numeric;
#   * rent stripped of "$" and "," and coerced to numeric;
#   * missing `petsAllowed` values set to TRUE;
#   * a new logical column `hasAC`, TRUE where `airconditioning` is not NA.
# Columns that are absent from `df` are skipped rather than raising an error.
# Columns are read with `[[` because `$` partial-matches names on a data.frame
# (e.g. `df$rent` would silently read `rent_period` if `rent` were missing).
cleanData <- function(df) {
  df[["X.context"]] <- NULL

  # Renaming for clarity
  names(df)[names(df) == "X.type"] <- "Type"

  # Converting all appropriate columns to numeric for analysis
  numericCols <- c("numberOfBedrooms", "numberOfBathroomsTotal", "Sqft",
                   "latitude", "longitude")
  for (col in intersect(numericCols, names(df))) {
    df[[col]] <- as.numeric(df[[col]])
  }
  if ("rent" %in% names(df)) {
    df[["rent"]] <- as.numeric(gsub("[$,]", "", df[["rent"]]))
  }

  # NOTE(review): if the source only ever records TRUE or missing here, this
  # makes the column constant, which would explain the NA petsAllowedTRUE
  # coefficient in the lm() summaries — confirm against the raw data.
  if ("petsAllowed" %in% names(df)) {
    df[["petsAllowed"]][is.na(df[["petsAllowed"]])] <- TRUE
  }

  # No airconditioning column at all means no listing carries the attribute.
  df[["hasAC"]] <- if ("airconditioning" %in% names(df)) {
    !is.na(df[["airconditioning"]])
  } else {
    rep(FALSE, nrow(df))
  }

  df
}
# Analysis-ready Davis-area listings (cleaned copy of dirtySacramentoData).
sacramentoData <- cleanData(dirtySacramentoData)
# Analysis-ready listings for the second city (cleaned copy of dirtySfData).
sfData <- cleanData(dirtySfData)
In order to view plots and draw insights, we first need to change these columns to the proper class (numeric in the case of the number of bedrooms/bathrooms). Then we can verify that our data is valid by checking whether these values for bedrooms/bathrooms make sense.
# Build one count histogram (bin width 1, black outline, minimal theme,
# size-11 title) so the four panels below share a single definition.
#   data      - listing data.frame to plot
#   mapping   - aes() mapping that selects the x variable
#   fillColor - bar fill color
#   plotTitle - plot title
#   xLabel    - x-axis label
countHistogram <- function(data, mapping, fillColor, plotTitle, xLabel) {
  ggplot(data, mapping) +
    geom_histogram(binwidth = 1, fill = fillColor, color = "black") +
    labs(title = plotTitle,
         x = xLabel,
         y = "Frequency") +
    theme_minimal() +
    theme(plot.title = element_text(size = 11))
}

# Davis panels
sacBeds <- countHistogram(sacramentoData, aes(x = numberOfBedrooms), "red",
                          "Histogram of Number of Bedrooms in Davis",
                          "Number of Bedrooms")
sacBaths <- countHistogram(sacramentoData, aes(x = numberOfBathroomsTotal), "orange",
                           "Histogram of Number of Bathrooms in Davis",
                           "Number of Bathrooms")

# Berkeley panels
sfBeds <- countHistogram(sfData, aes(x = numberOfBedrooms), "red",
                         "Histogram of Number of Bedrooms in Berkeley",
                         "Number of Bedrooms")
sfBaths <- countHistogram(sfData, aes(x = numberOfBathroomsTotal), "orange",
                          "Histogram of Number of Bathrooms in Berkeley",
                          "Number of Bathrooms")

# Two-by-two grid: Davis on the top row, Berkeley on the bottom row.
grid.arrange(sacBeds, sacBaths, sfBeds, sfBaths, nrow = 2)
## Warning: Removed 14 rows containing non-finite outside the scale range
## (`stat_bin()`).
## Warning: Removed 68 rows containing non-finite outside the scale range
## (`stat_bin()`).
## Warning: Removed 1 row containing non-finite outside the scale range
## (`stat_bin()`).
To start, the values seem to make sense and there are no values that we
wouldn’t expect to be here, so we can assume that we scraped the data
correctly. It appears that most of the housing in Berkeley is
comparatively smaller since the data on bedrooms is skewed right (there
is more often a smaller number of bedrooms). This also coincides with
the number of bathrooms since smaller living situations typically have
one bathroom, and one is the mode for the number of bathrooms in
Berkeley (most common value is 1). This is also true for Davis, but you
can see that the frequency of having two or more bathrooms in Davis is
over double that of Berkeley.
# Scatter plot of rent against square footage for one city. Both cities are
# drawn on identical axes so the two plots can be compared directly.
#   data       - listing data.frame with Sqft and rent columns
#   pointColor - color of the points
#   plotTitle  - plot title
rentVsSqftPlot <- function(data, pointColor, plotTitle) {
  ggplot(data, aes(x = Sqft, y = rent)) +
    geom_point(color = pointColor) +
    labs(title = plotTitle,
         x = "Square Footage (sqft)",
         y = "Rent ($)") +
    ylim(0, 14000) + # $14,000 is the biggest rent value across both data sets
    xlim(0, 3700) +  # similar logic: 3,700 covers the square footage in both
    theme_minimal()
}

sacRentPlot <- rentVsSqftPlot(sacramentoData, "blue", "Davis : Rent vs. Square Footage")
sfRentPlot <- rentVsSqftPlot(sfData, "darkgreen", "Berkeley : Rent vs. Square Footage")

# Render both as interactive plotly widgets.
ggplotly(sacRentPlot)
ggplotly(sfRentPlot)
Using the above plot we can quickly see that the rent is generally going to be higher in Berkeley, but I am going to also conduct a statistical analysis to observe if this difference is significant.
I want to first observe if there is a difference in square foot per dollar to account for bigger places costing more. In reality, a potential customer doesn’t mind a significantly larger apartment to cost more than one that is comparatively smaller. However, I want to see if the cost of living is overall more expensive in Berkeley when you properly account for the size of the apartment as well.
# Square feet obtained per dollar of monthly rent, per listing.
sacramentoData$sqftPerDollar <- sacramentoData$Sqft / sacramentoData$rent
sfData$sqftPerDollar <- sfData$Sqft / sfData$rent

# Keep only listings with a usable ratio. is.finite() drops NA/NaN and also the
# Inf produced by a listing whose rent is 0, which !is.na() would let through
# and which would make the group mean (and the t-test) meaningless.
sac_clean <- sacramentoData[is.finite(sacramentoData$sqftPerDollar), ]
sf_clean <- sfData[is.finite(sfData$sqftPerDollar), ]

# Label each row with its respective city to make the comparison clearer.
sac_clean$city <- "Davis"
sf_clean$city <- "Berkeley"

combined <- rbind(sac_clean, sf_clean)

# Distribution of the ratio in each city.
ggplot(combined, aes(x = city, y = sqftPerDollar, fill = city)) +
  geom_boxplot() +
  labs(title = "Square Footage per Dollar by City",
       x = "City",
       y = "Square Feet per Dollar") +
  theme_minimal()

# Two-sample t-test of the mean ratio between the two cities.
t.test(sqftPerDollar ~ city, data = combined)
##
## Welch Two Sample t-test
##
## data: sqftPerDollar by city
## t = -8.997, df = 504.81, p-value < 2.2e-16
## alternative hypothesis: true difference in means between group Berkeley and group Davis is not equal to 0
## 95 percent confidence interval:
## -0.12756783 -0.08183941
## sample estimates:
## mean in group Berkeley mean in group Davis
## 0.3013338 0.4060374
Since this p-value is approximately equal to zero (p < 0.05), we reject the null hypothesis that there is no difference in square footage per dollar between Davis and Berkeley. This result provides statistical evidence that there is a difference in the average amount of space one gets per dollar of rent between the two cities.
From our sample, the mean square footage per dollar in Davis is higher than in Berkeley, which suggests that renters in Davis tend to receive more living space for the same price. Specifically, Davis listings averaged 0.406 sq ft per dollar, while Berkeley listings averaged 0.301 sq ft per dollar.
# One color scale shared by both maps, so a given color means the same rent in
# either city. The domain spans the rents of BOTH data sets: a domain built
# from the Davis rents alone leaves higher Berkeley rents outside the scale,
# where they are treated as NA.
pal <- colorNumeric(palette = "YlOrRd",
                    domain = range(c(sacramentoData$rent, sfData$rent), na.rm = TRUE))

# Draw one listing map: a circle per listing colored by rent, a popup showing
# title, rent and square footage, and a rent legend in the bottom-right corner.
#   listings - data.frame with longitude, latitude, rent, title and Sqft columns
rentMap <- function(listings) {
  leaflet(data = listings) %>%
    addTiles() %>%
    addCircleMarkers(~longitude, ~latitude,
                     color = ~pal(rent),
                     radius = 5,
                     stroke = FALSE,
                     fillOpacity = 0.7,
                     popup = ~paste0("<strong>", title, "</strong><br>",
                                     "Rent: $", rent, "<br>",
                                     "Sqft: ", Sqft)) %>%
    addLegend("bottomright", pal = pal, values = ~rent,
              title = "Rent ($)",
              opacity = 1)
}

# Create the maps
rentMap(sacramentoData)
rentMap(sfData)
## Warning in pal(rent): Some values were outside the color scale and will be
## treated as NA
## Warning in pal(rent): Some values were outside the color scale and will be
## treated as NA
## Warning in pal(c(r[1], cuts, r[2])): Some values were outside the color scale
## and will be treated as NA
As you can see, the Woodland housing accommodations are generally much cheaper due to their greater distance from the university. Additionally, we can observe that there is certainly a premium that the customer pays when living closer to Berkeley, and Berkeley simply has more apartments in the orange color range (approximately $3,000 per month) than Davis does.
Now we will observe whether housing accommodations that allow pets are more expensive. Generally it is believed that pet-friendly living is more expensive since the demand for it is high, which drives up the typical cost. However, since Davis is a college town and there are fewer pets, I want to observe whether there is any difference in the price of living for pet-friendly vs. non-pet-friendly housing. Additionally, in this analysis a posting is counted as pet-friendly only if it lists a cat or dog attribute; a posting that lists neither is counted as not pet-friendly.
# Restrict to Davis listings with a known rent.
petDataSac <- sacramentoData[!is.na(sacramentoData$rent), ]

# A listing counts as pet-friendly (TRUE) when it carries a cat or a dog
# attribute, and as FALSE when it carries neither. Stored as a factor with
# levels FALSE, TRUE so both groups appear in that order.
petDataSac$petsAllowed <- factor(
  !(is.na(petDataSac$pets_cat) & is.na(petDataSac$pets_dog)),
  levels = c(FALSE, TRUE)
)

# Group sizes.
table(petDataSac$petsAllowed)
##
## FALSE TRUE
## 22 331
# Davis rent distribution for pet-friendly vs. other listings.
ggplot(petDataSac, aes(x = petsAllowed, y = rent, fill = petsAllowed)) +
  geom_boxplot() +
  ggtitle("Rent Comparison: Pet-Friendly vs. Not") +
  xlab("Pets Allowed") +
  ylab("Rent ($)") +
  theme_minimal()

# Mean rent within each group.
aggregate(rent ~ petsAllowed, petDataSac, mean)
## petsAllowed rent
## 1 FALSE 2159.182
## 2 TRUE 2328.408
# Two-sample t-test comparing mean Davis rent between the petsAllowed groups (FALSE vs. TRUE).
t.test(rent ~ petsAllowed, data = petDataSac)
##
## Welch Two Sample t-test
##
## data: rent by petsAllowed
## t = -1.2759, df = 25.868, p-value = 0.2133
## alternative hypothesis: true difference in means between group FALSE and group TRUE is not equal to 0
## 95 percent confidence interval:
## -441.9299 103.4778
## sample estimates:
## mean in group FALSE mean in group TRUE
## 2159.182 2328.408
Even though pet-friendly units in Davis have a slightly higher average rent ($2328.41 vs. $2159.18), the difference is small and not statistically significant (p = 0.2133). Based on this sample, we cannot conclude that allowing pets affects rent.
I will now observe Berkeley and see if the conclusion is any different.
# Same approach as for Davis: keep Berkeley listings with a known rent, then
# flag a listing TRUE when it carries a cat or a dog attribute, FALSE otherwise.
petDataSF <- sfData[!is.na(sfData$rent), ]
petDataSF$petsAllowed <- factor(
  !(is.na(petDataSF$pets_cat) & is.na(petDataSF$pets_dog)),
  levels = c(FALSE, TRUE)
)

# Group sizes.
table(petDataSF$petsAllowed)
##
## FALSE TRUE
## 96 264
# Berkeley rent distribution for pet-friendly vs. other listings.
ggplot(petDataSF, aes(x = petsAllowed, y = rent, fill = petsAllowed)) +
  geom_boxplot() +
  ggtitle("Berkeley: Rent Comparison - Pet-Friendly vs. Not") +
  xlab("Pets Allowed") +
  ylab("Rent ($)") +
  theme_minimal()

# Mean rent within each group.
aggregate(rent ~ petsAllowed, petDataSF, mean)
## petsAllowed rent
## 1 FALSE 2755.521
## 2 TRUE 2599.523
# Two-sample t-test comparing mean Berkeley rent between the petsAllowed groups (FALSE vs. TRUE).
t.test(rent ~ petsAllowed, data = petDataSF)
##
## Welch Two Sample t-test
##
## data: rent by petsAllowed
## t = 0.73013, df = 121.93, p-value = 0.4667
## alternative hypothesis: true difference in means between group FALSE and group TRUE is not equal to 0
## 95 percent confidence interval:
## -266.9636 578.9598
## sample estimates:
## mean in group FALSE mean in group TRUE
## 2755.521 2599.523
The direction of the difference is reversed in Berkeley: if pets are allowed the average rent is $2599.52, and if they aren’t allowed the rent is on average $2755.52. However, as in Davis, the difference is not statistically significant (p = 0.4667), so we again cannot conclude that allowing pets affects rent. The reversed direction may arise because we didn’t account for things such as the prestige of the living situation: some places that present themselves as luxurious would likely not allow pets, but would still be willing to charge premium rent.
Now I’m interested in testing if having air conditioning on your posting will result in higher rent prices on average.
# Davis rent distribution for listings with vs. without an air-conditioning attribute.
ggplot(sacramentoData, aes(x = hasAC, y = rent, fill = hasAC)) +
  geom_boxplot() +
  ggtitle("Rent by Air Conditioning Availability") +
  xlab("Has Air Conditioning") +
  ylab("Rent ($)") +
  theme_minimal()

# Mean rent within each hasAC group (na.rm is forwarded to mean()).
aggregate(rent ~ hasAC, sacramentoData, mean, na.rm = TRUE)
## hasAC rent
## 1 FALSE 2226.621
## 2 TRUE 2347.703
# Two-sample t-test comparing mean Davis rent between listings without and with air conditioning.
t.test(rent ~ hasAC, data = sacramentoData)
##
## Welch Two Sample t-test
##
## data: rent by hasAC
## t = -1.6892, df = 255.64, p-value = 0.09241
## alternative hypothesis: true difference in means between group FALSE and group TRUE is not equal to 0
## 95 percent confidence interval:
## -262.24524 20.08061
## sample estimates:
## mean in group FALSE mean in group TRUE
## 2226.621 2347.703
The p-value for the t-test is greater than 0.05 (0.09241 > 0.05), so there isn’t significant evidence that AC has a real effect on the amount of rent. Units with AC do on average have a higher rent ($2347.70 vs. $2226.62); however, the difference is not statistically significant.
# Berkeley rent distribution for listings with vs. without an air-conditioning attribute.
ggplot(sfData, aes(x = hasAC, y = rent, fill = hasAC)) +
  geom_boxplot() +
  ggtitle("Berkeley: Rent by Air Conditioning Availability") +
  xlab("Has Air Conditioning") +
  ylab("Rent ($)") +
  theme_minimal()

# Mean rent within each hasAC group (na.rm is forwarded to mean()).
aggregate(rent ~ hasAC, sfData, mean, na.rm = TRUE)
## hasAC rent
## 1 FALSE 2597.611
## 2 TRUE 2831.403
# Two-sample t-test comparing mean Berkeley rent between listings without and with air conditioning.
t.test(rent ~ hasAC, data = sfData)
##
## Welch Two Sample t-test
##
## data: rent by hasAC
## t = -1.4825, df = 134.33, p-value = 0.1406
## alternative hypothesis: true difference in means between group FALSE and group TRUE is not equal to 0
## 95 percent confidence interval:
## -545.70071 78.11659
## sample estimates:
## mean in group FALSE mean in group TRUE
## 2597.611 2831.403
There is once again not enough statistical evidence to conclude that having air conditioning significantly affects the price of rent in this Berkeley data set. However, in this sample the average rent was $2831.40 vs. $2597.61, which indicates there was still an average difference of about $234.
Now, finally, I want to answer the question “Which features are most predictive of rent?”. I will do this by constructing a model and viewing which predictors contribute the most to it. We will first take a look at the Davis data and then look at our Berkeley data to see if there is any difference in what contributes to an increase in rent.
# Linear model of Davis rent on square footage, bedroom and bathroom counts, and the pet, AC, parking and laundry columns.
# NOTE(review): the summary reports petsAllowedTRUE as NA ("1 not defined because of singularities"); presumably petsAllowed is constant in the rows used because cleanData() sets every missing value to TRUE — confirm, and consider the pets_cat/pets_dog indicator instead.
modelSac <- lm(rent ~ Sqft + numberOfBedrooms + numberOfBathroomsTotal + petsAllowed + hasAC + parking + laundry, data = sacramentoData)
# Coefficient table, residual summary and fit statistics.
summary(modelSac)
##
## Call:
## lm(formula = rent ~ Sqft + numberOfBedrooms + numberOfBathroomsTotal +
## petsAllowed + hasAC + parking + laundry, data = sacramentoData)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2038.72 -204.14 8.62 181.54 1641.37
##
## Coefficients: (1 not defined because of singularities)
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 648.2353 216.4731 2.995 0.003014 **
## Sqft 1.2970 0.1875 6.917 3.57e-11 ***
## numberOfBedrooms 152.6709 51.8242 2.946 0.003512 **
## numberOfBathroomsTotal -120.4971 59.8898 -2.012 0.045253 *
## petsAllowedTRUE NA NA NA NA
## hasACTRUE 289.5197 80.3033 3.605 0.000374 ***
## parkingcarport 253.1707 134.5101 1.882 0.060930 .
## parkingdetached garage -237.9578 402.8213 -0.591 0.555216
## parkingoff-street parking -179.8169 143.1972 -1.256 0.210342
## parkingstreet parking 31.4479 270.5445 0.116 0.907553
## laundrylaundry on site 48.0866 131.7206 0.365 0.715359
## laundryno laundry on site 44.6844 467.4922 0.096 0.923925
## laundryw/d hookups -255.3812 214.4740 -1.191 0.234844
## laundryw/d in unit 96.6656 107.6945 0.898 0.370234
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 377.7 on 260 degrees of freedom
## (80 observations deleted due to missingness)
## Multiple R-squared: 0.7789, Adjusted R-squared: 0.7687
## F-statistic: 76.35 on 12 and 260 DF, p-value: < 2.2e-16
Observing the column labeled “Estimate” in the table above shows us how much each predictor is associated with a change in rent. For example, if we observe the “numberOfBedrooms” variable we can see that every additional bedroom adds roughly $152.67 to the rent, and that logic can be applied to all of our other variables. It is also important to note that these values describe the increase/decrease in rent when we hold all other variables constant, whereas the t-tests we conducted earlier did not. We can then see that the largest statistically significant contributors to an increase in rent are larger square footage ($1.30 for every additional square foot) and having AC (which increases the rent by $289.52). Having a carport is estimated to cost roughly $253.17 more per month than the baseline parking category (the category not listed in the table), although this estimate is only marginally significant (p ≈ 0.06). Additionally, this model estimates that an apartment with just hookups for a washer/dryer (not having the machines themselves and being required to bring your own) has a rent that is $255.38 cheaper with every other variable held constant, but this estimate is not statistically significant (p ≈ 0.23), so it should be read as suggestive only. If the pattern holds, someone who wants cheaper rent in Davis could look for places that only have washer/dryer hookups and purchase their own machines.
# Same linear model specification fitted to the Berkeley data.
# NOTE(review): petsAllowedTRUE is again reported as NA ("1 not defined because of singularities"); presumably petsAllowed is constant in the rows used because cleanData() sets every missing value to TRUE — confirm.
modelSf <- lm(rent ~ Sqft + numberOfBedrooms + numberOfBathroomsTotal + petsAllowed + hasAC + parking + laundry, data = sfData)
# Coefficient table, residual summary and fit statistics.
summary(modelSf)
##
## Call:
## lm(formula = rent ~ Sqft + numberOfBedrooms + numberOfBathroomsTotal +
## petsAllowed + hasAC + parking + laundry, data = sfData)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2164.67 -376.03 -27.97 333.52 2872.77
##
## Coefficients: (1 not defined because of singularities)
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 118.8600 179.9895 0.660 0.509931
## Sqft 2.9099 0.2851 10.207 < 2e-16 ***
## numberOfBedrooms 365.6463 119.1951 3.068 0.002520 **
## numberOfBathroomsTotal -445.8891 128.5240 -3.469 0.000665 ***
## petsAllowedTRUE NA NA NA NA
## hasACTRUE 202.4131 130.2453 1.554 0.122067
## parkingcarport 269.6585 148.9439 1.810 0.072031 .
## parkingdetached garage -266.2639 180.5343 -1.475 0.142143
## parkingno parking -105.2348 411.6542 -0.256 0.798546
## parkingoff-street parking -91.9404 148.4187 -0.619 0.536459
## parkingstreet parking -178.2395 176.6905 -1.009 0.314555
## laundrylaundry on site 112.8019 176.0858 0.641 0.522661
## laundryno laundry on site 189.3900 324.0855 0.584 0.559756
## laundryw/d hookups -420.0608 300.4113 -1.398 0.163893
## laundryw/d in unit 305.3260 130.5047 2.340 0.020496 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 673.5 on 166 degrees of freedom
## (180 observations deleted due to missingness)
## Multiple R-squared: 0.7818, Adjusted R-squared: 0.7647
## F-statistic: 45.74 on 13 and 166 DF, p-value: < 2.2e-16